library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.18.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)
First, we import the fastq files containing the raw reads. The samples were downloaded from the SRA database with the accession number PRJEB11419. IBS and healthy samples were selected out of the thousands of samples as described in 00_Metadata-AGP.R.
# Save the path to the directory containing the fastq zipped files
path <- "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP"
# list.files(path) # check we are in the right directory
# fastq filenames have format: SAMPLENAME.fastq.gz
# Saves the whole directory path to each file name
FNs <- sort(list.files(file.path(path, "original"), pattern=".fastq.gz", full.names = TRUE))
show(FNs[1:5])
## [1] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073039_1.fastq.gz"
## [2] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073655_1.fastq.gz"
## [3] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073725_1.fastq.gz"
## [4] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074570_1.fastq.gz"
## [5] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074573_1.fastq.gz"
# Extract sample names, assuming filenames have format: SAMPLENAME.fastq
sample.names <- sapply(strsplit(basename(FNs), "_"), `[`, 1)
show(sample.names[1:5]) # saves only the file name (without the path)
## [1] "ERR1073039" "ERR1073655" "ERR1073725" "ERR1074570" "ERR1074573"
# Look at quality of all files
for (i in 1:4){ # 1:length(FNs)
show(plotQualityProfile(FNs[i]))
}
# Look at nb of reads per sample
# raw_stats <- data.frame('sample' = sample.names,
# 'reads' = fastqq(FNs)@nReads)
min(raw_stats$reads)
## [1] Inf
max(raw_stats$reads)
## [1] -Inf
mean(raw_stats$reads)
## [1] NA
# hist(log10(raw_stats$reads))
We will have a quick peak at the per base sequence content of the reads in some samples, to make sure there is no anomaly (i.e. all reads having the same sequence).
# Look at per base sequence content
fseq <- seqTools::fastqq(FNs[4])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074570_1.fastq.gz' done.
rc <- read_content(fseq)
plot_read_content(rc) + labs(title = "Per base sequence content")
plot_read_content(rc) + xlim(0,50) + labs(title = "Per base sequence content")
Now, we will look whether the reads still contain the primers.
# Import metadata, that contains the primer sequences
metadata <- read.csv(file.path(path, "Metadata-AGP.csv"))
table(metadata$pcr_primers..exp.)
##
## FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT
## 45
## FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT
## 1245
# V4
FWD <- "GTGYCAGCMGCCGCGGTAA" # 515F primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # 806R primer sequence
# Function that, from the primer sequence, will return all combinations possible (complement, reverse complement, ...)
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
# Get all combinations of the primer sequences
FWD.orients <- allOrients(FWD) # 515F
REV.orients <- allOrients(REV) # 806R
FWD.orients # sanity check
## Forward Complement Reverse
## "GTGYCAGCMGCCGCGGTAA" "CACRGTCGKCGGCGCCATT" "AATGGCGCCGMCGACYGTG"
## RevComp
## "TTACCGCGGCKGCTGRCAC"
REV.orients
## Forward Complement Reverse
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG"
## RevComp
## "ATTAGAWACCCBDGTAGTCC"
# Function that counts number of reads in which a sequence is found
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE, max.mismatch = 2)
return(sum(nhits > 0))
}
# Get a table to know how many times the 515F and 806R primers are found in the reads of each sample
for (i in 1:4){ # 1:length(FNs)
cat("SAMPLE", sample.names[i], "with total number of", raw_stats[i,'reads'], "reads\n\n")
x <- rbind(ForwardPrimer = sapply(FWD.orients, primerHits, fn = FNs[[i]]),
ReversePrimer = sapply(REV.orients, primerHits, fn = FNs[[i]]))
print(x)
cat("\n____________________________________________\n\n")
}
## SAMPLE ERR1073039 with total number of reads
##
## Forward Complement Reverse RevComp
## ForwardPrimer 0 0 0 0
## ReversePrimer 0 0 0 4
##
## ____________________________________________
##
## SAMPLE ERR1073655 with total number of reads
##
## Forward Complement Reverse RevComp
## ForwardPrimer 0 0 0 0
## ReversePrimer 0 0 0 33
##
## ____________________________________________
##
## SAMPLE ERR1073725 with total number of reads
##
## Forward Complement Reverse RevComp
## ForwardPrimer 0 0 0 0
## ReversePrimer 0 0 0 8
##
## ____________________________________________
##
## SAMPLE ERR1074570 with total number of reads
##
## Forward Complement Reverse RevComp
## ForwardPrimer 5 0 0 0
## ReversePrimer 2 0 0 1600
##
## ____________________________________________
The reads do not contain any primer. We will go forward directly with the quality filtering.
Then, we perform a quality filtering of the reads.
# Place filtered files in a filtered/ subdirectory
filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_filt.fastq.gz"))
# Assign names for the filtered fastq.gz files
names(filt2_samples) <- sample.names
out2 <- filterAndTrim(fwd = FNs, filt = filt2_samples,
maxEE=4, # reads with more than 4 expected errors (sum(10e(-Q/10))) are discarded
truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
minLen=150, # Discard reads shorter than 200 bp. This is done after trimming and truncation.
compress=TRUE, # Output files in gzip format
verbose=TRUE) # give a message saying the percentage of filtered sequences
# Update list of filtered files
# FNs_filt <- file.path(path, "filtered2", paste0(sample.names, "_filt.fastq.gz"))
FNs_filt <- sort(list.files(file.path(path, "filtered2"), pattern=".fastq.gz", full.names = TRUE))
sample.names_filt <- sapply(strsplit(basename(FNs_filt), "_"), `[`, 1)
names(FNs_filt) <- sample.names_filt
About ~35 samples had a large number of reads but none of them passed the quality filter. Let’s inspect quickly the quality profile of some of them.
for (i in 195:234){ # printing 2 samples before & after the "abnormal samples", that actually passed the quality filter (as comparison)
show(plotQualityProfile(FNs[i]))
}
The quality profile of these discarded samples show some abnormalities, and reads were also shorter than 150bp. It is not surprising none of the reads passed the quality filtering.
Let’s look at the output filtered fastq files as sanity check.
out2[1:4,] # show how many reads were filtered in each file
## reads.in reads.out
## ERR1073039_1.fastq.gz 17661 16321
## ERR1073655_1.fastq.gz 53710 53268
## ERR1073725_1.fastq.gz 26291 26120
## ERR1074570_1.fastq.gz 27191 24237
# Look at quality profile of filtered files
for (i in 1:4){
show(plotQualityProfile(filt2_samples[i]))
}
Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.
err <- learnErrors(FNs_filt, multithread=TRUE, randomize=TRUE, verbose = 1)
The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.
plotErrors(err, nominalQ = TRUE)
The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.
# Prepare empty vector for the infered sequences
seq_infered <- vector("list", length(sample.names_filt))
names(seq_infered) <- sample.names_filt # names of each row will be the sample names
# Iterate through the 1242 samples
for(sampl in sample.names_filt) {
cat("Processing:", sampl, "\n") # print which sample is being processed
derep <- derepFastq(FNs_filt[[sampl]]) # dereplicate the reads in the sample
seq_infered[[sampl]] <- dada(derep, err=err, multithread=TRUE) # default parameters
}
# Inspect the infered sequence variants from sample 1:5
for (i in 1:5){
print(seq_infered[[i]])
print("________________")
}
## dada-class: object describing DADA2 denoising results
## 164 sequence variants were inferred from 3842 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 127 sequence variants were inferred from 4897 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 270 sequence variants were inferred from 6218 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 213 sequence variants were inferred from 6000 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 318 sequence variants were inferred from 6087 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.
# Make sequence table from the infered sequence variants
seqtable <- makeSequenceTable(seq_infered)
# We should have 1242 samples (1242 rows)
dim(seqtable)
## [1] 1242 27790
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")
The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.
seqtable.nochim <- removeBimeraDenovo(seqtable, method="consensus", multithread=TRUE, verbose=TRUE)
# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1] 1242 18431
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable)
## [1] 0.9553288
Sanity check before assigning taxonomy.
# Function that counts nb of reads
getN <- function(x) sum(getUniques(x))
# Table that will count number of reads for each process of interest (input reads, filtered reads, denoised reads, non chimera reads)
rownames(out2) <- sapply(strsplit(rownames(out2), "_"), `[`, 1)
track <- out2 %>%
as.data.frame() %>%
mutate(Run=rownames(out2)) %>%
rename("input"="reads.in",
"quality-filt"="reads.out") %>%
left_join(sapply(seq_infered, getN) %>%
as.data.frame %>%
rename(denoised=".") %>%
mutate(Run=names(sapply(seq_infered, getN))),
by="Run") %>%
left_join(rowSums(seqtable.nochim) %>%
as.data.frame %>%
rename(nonchim=".") %>%
mutate(Run=rownames(seqtable.nochim)),
by="Run") %>%
relocate(Run) %>%
mutate("%kept_input->output"=as.integer(nonchim*100/input))
# Show final table: for each row/sample, we have shown the initial number of reads, filtered reads, denoised reads, and non chimera reads
track
Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.
# Assign taxonomy (with silva v138)
taxa <- assignTaxonomy(seqtable.nochim, "/Users/scarcy/Desktop/Praktikum/Data/silva_nr_v138_train_set.fa.gz",
tryRC = TRUE, # try reverse complement of the sequences
multithread=TRUE, verbose = TRUE)
# Add species assignment
taxa <- addSpecies(taxa, "/Users/scarcy/Desktop/Praktikum/Data/silva_species_assignment_v138.fa.gz")
# Check how the taxonomy table looks like
taxa.print <- taxa
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
## Kingdom Phylum Class Order
## [1,] "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Enterobacterales"
## [2,] "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Enterobacterales"
## [3,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"
## [4,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"
## [5,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"
## [6,] "Bacteria" "Firmicutes" "Clostridia" "Oscillospirales"
## Family Genus Species
## [1,] "Enterobacteriaceae" "Escherichia/Shigella" NA
## [2,] "Enterobacteriaceae" "Escherichia/Shigella" NA
## [3,] "Bacteroidaceae" "Bacteroides" "vulgatus"
## [4,] "Bacteroidaceae" "Bacteroides" "vulgatus"
## [5,] "Bacteroidaceae" "Bacteroides" "uniformis"
## [6,] "Ruminococcaceae" "Faecalibacterium" NA
table(taxa.print[,1]) # Show the different kingdoms
##
## Archaea Bacteria Eukaryota
## 47 16910 1076
table(taxa.print[,2]) # Show the different phyla
##
## Abditibacteriota Acidobacteriota Actinobacteriota Ascomycota
## 2 27 801 1
## Bacteroidota Basidiomycota Bdellovibrionota Campilobacterota
## 2396 9 1 30
## Chloroflexi Crenarchaeota Cyanobacteria Deinococcota
## 8 6 143 18
## Desulfobacterota Elusimicrobiota Entotheonellaeota Euglenozoa
## 137 6 1 37
## Euryarchaeota Firmicutes Fusobacteriota Gemmatimonadota
## 21 11373 70 2
## Halobacterota Hydrogenedentes Incertae_Sedis Methylomirabilota
## 1 1 20 1
## Mucoromycota Myxococcota Nitrospirota Patescibacteria
## 8 6 1 13
## Planctomycetota Proteobacteria RCP2-54 Spirochaetota
## 17 1331 2 10
## Synergistota Thermoplasmatota Verrucomicrobiota Vertebrata
## 30 19 140 1
## WOR-1
## 1
table(is.na(taxa.print[,2])) # is there any NA phyla?
##
## FALSE TRUE
## 16691 1740
We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.
The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.
#_________________________
# Import metadata
metadata <- read.csv(file.path(path, "Metadata-AGP.csv"))
# Keep relevant covariates
sampledf <- metadata %>%
select(-"pcr_primers..exp.")
rownames(sampledf) <- sampledf$Run
sampledf$author <- "AGP"
sampledf$sequencing_tech <- "Illumina"
sampledf$variable_region <- "V4"
#_________________________
# Create phyloseq object
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
sample_data(sampledf),
tax_table(taxa))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)
# Remove taxa that are eukaryota, or have unassigned Phyla
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
physeq <- filter_taxa(physeq, function(x) sum(x > 0) > 0, TRUE) # keep ASVs present in at least 1 sample
physeq.filt <- filter_taxa(physeq, function(x) sum(x > 0) >= 2, TRUE)
# Save to disk
saveRDS(raw_stats, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/raw_stats.rds")
saveRDS(out2, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/out2.rds")
saveRDS(err, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/err.rds")
saveRDS(seq_infered, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/seq_infered.rds")
saveRDS(otu_table(physeq), "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/ASVtable_final.rds") # ASV table
saveRDS(tax_table(physeq), "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/taxa_final.rds") # taxa table
saveRDS(sample_data(physeq), "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/metadata_final.rds") # metadata table
write.csv(sample_data(physeq), "~/Projects/IBS_Meta-analysis_16S/analysis-individual/AGP/Metadata-AGP.csv")
saveRDS(physeq, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/physeq_AGP.rds") # phyloseq object
saveRDS(physeq.filt, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/physeq(ASVfilt)_AGP.rds")
# Save nb of reads per sample before / after dada2 pipeline
nb.reads <- readRDS("~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/nb_reads.rds")
nb.reads[1:1191, 'AGP_before'] <- fastqq(FNs)@nReads
nb.reads[1:1191, 'AGP_after'] <- rowSums(seqtable.nochim)
median(na.omit(nb.reads$AGP_before)) # sanity check
median(na.omit(nb.reads$AGP_after)) # sanity check
saveRDS(nb.reads, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/nb_reads.rds")
# Absolute abundance
# plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free") + theme(axis.text.x = element_blank()) # long to compute
# Relative abundance for Phylum
phylum.table <- physeq %>%
tax_glom(taxrank = "Phylum") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() # Melt to long format
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
y = Abundance, fill = Phylum))+
facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
geom_bar(stat = "identity") +
theme(axis.text.x = element_blank())+
labs(x = "Samples", y = "Relative abundance")